This notebook contains code to process and normalize the lexical information appearing in CodeMethod comments and implementations (i.e., CodeMethod.comment and CodeMethod.code, respectively).
The overall processing encompasses a sequence of lexical normalization steps (based on nltk and the LINSENnormalizer). Once those processing steps are completed, the jaccard_coefficient is computed between the code and the comment of each method, and all the analysis information is then stored in a CodeLexiconInfo model instance.
This notebook requires Python 3.
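For reference, a minimal sketch of the Jaccard coefficient between two token sets (illustrative only: the actual value used throughout this notebook is computed by LexicalAnalyzer and stored in CodeLexiconInfo.jaccard_coeff):
In [ ]:
# Illustrative sketch only -- not the project implementation.
def jaccard_coefficient(tokens_a, tokens_b):
    set_a, set_b = set(tokens_a), set(tokens_b)
    if not (set_a or set_b):
        return 0.0  # both sets empty: define the coefficient as 0
    return len(set_a & set_b) / len(set_a | set_b)

jaccard_coefficient(['make', 'coffee'], ['make', 'coffee', 'cup'])  # --> 2/3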
In [2]:
%load preamble_directives.py
In [3]:
from source_code_analysis.models import CodeLexiconInfo
In [ ]:
from lexical_analysis import LINSENnormalizer
In [5]:
from lexical_analysis import LexicalAnalyzer
In [5]:
from source_code_analysis.models import SoftwareProject
target_sw_project = SoftwareProject.objects.get(name__iexact='CoffeeMaker')
In [6]:
# Use the RelatedManager to get all the code methods associated with the target project
code_methods = target_sw_project.code_methods.all()
In [10]:
total_methods = code_methods.count()
coefficients = list()
for i, method in enumerate(code_methods):
    print('Analyzing Method {0} out of {1}: {2}'.format(i+1, total_methods, method.method_name))
    analyzer = LexicalAnalyzer(method)
    analyzer.analyse_textual_information()
    coefficients.append(analyzer.code_lexical_info.jaccard_coeff)
In [4]:
from scipy import median, mean, var, std
import numpy as np
In [5]:
from source_code_analysis.models import SoftwareProject
projects = list()
projects.append(SoftwareProject.objects.get(name__iexact='CoffeeMaker', version__exact='1.0'))
projects.append(SoftwareProject.objects.get(name__iexact='Jfreechart', version__exact='0.6.0'))
projects.append(SoftwareProject.objects.get(name__iexact='Jfreechart', version__exact='0.7.1'))
projects.append(SoftwareProject.objects.get(name__iexact='JHotDraw', version__exact='7.4.1'))
print(projects)
In [8]:
for project in projects:
    code_methods = project.code_methods.all()
    coefficients = list()
    for method in code_methods:
        # Check that this method has no "wrong_association"
        n_evaluations = method.agreement_evaluations.count()
        n_eval_wrong_association = method.agreement_evaluations.filter(wrong_association=True).count()
        if n_evaluations == n_eval_wrong_association:
            # if **all** the evaluations for the current method mark it as a wrong_association,
            # exclude it from the statistics
            continue
        clexicon_info = method.lexical_info
        coefficients.append(clexicon_info.jaccard_coeff)
    coeff = np.array(coefficients)
    print('{proj} ({ver}) & {total} & {min:.3} & {max:.3} & {median:.3} & {mean:.3} & {variance:.3} & {devstd:.3} \\\\'.format(
        proj=project.name.title(), ver=project.version,
        total=coeff.size, min=coeff.min(), max=coeff.max(),
        median=median(coeff), mean=coeff.mean(),
        variance=var(coeff), devstd=std(coeff)))
In [21]:
# Import Scikit-Learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

for project in projects:
    # Populate the Doc Collection
    document_collection = list()
    # Get Methods
    code_methods = project.code_methods.all()
    for method in code_methods:
        # Check that this method has no "wrong_association"
        n_evaluations = method.agreement_evaluations.count()
        n_eval_wrong_association = method.agreement_evaluations.filter(wrong_association=True).count()
        if n_evaluations == n_eval_wrong_association:
            # if **all** the evaluations for the current method mark it as a wrong_association,
            # exclude it from the statistics
            continue
        clexicon_info = method.lexical_info
        document_collection.append(clexicon_info.normalized_comment)
        document_collection.append(clexicon_info.normalized_code)
    vectorizer = TfidfVectorizer(input='content', sublinear_tf=True, lowercase=False)
    tfidf_values = vectorizer.fit_transform(document_collection)
    #cosine_sim_vals = list()
    #rows, cols = tfidf_values.shape
    #for i in range(0, rows, 2):
    #    cosine_sim_vals.append(tfidf_values[i].dot(tfidf_values[i+1].T)[0,0])
    #cosine_sim_vals = np.array(cosine_sim_vals)
    comments, code = tfidf_values[::2], tfidf_values[1::2]
    kernel_matrix = linear_kernel(comments, code)  # arrays are still L2 (length) normalized
    cosine_sim_vals = np.diag(kernel_matrix)
    print('{proj} ({ver}) & {tot} & {min:.3} & {max:.3} & {med:.3} & {mu:.3} & {var:.3} & {sigma:.3} \\\\'.format(
        proj=project.name.title(), ver=project.version, tot=cosine_sim_vals.size, min=cosine_sim_vals.min(),
        max=cosine_sim_vals.max(), med=median(cosine_sim_vals), mu=cosine_sim_vals.mean(),
        var=var(cosine_sim_vals), sigma=std(cosine_sim_vals)))
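A quick sanity check on the slicing trick used above: since TfidfVectorizer L2-normalizes each row by default, the diagonal of linear_kernel(comments, code) coincides with the pairwise cosine similarities of corresponding rows. A self-contained verification on synthetic data (random vectors stand in for the actual tf-idf rows; not part of the analysis):
In [ ]:
# Self-contained check on synthetic data: for L2-normalized rows,
# diag(linear_kernel(A, B)) equals the row-wise cosine similarities.
from sklearn.preprocessing import normalize
rng = np.random.RandomState(0)
A = normalize(rng.rand(5, 8))  # stand-in for the comment vectors
B = normalize(rng.rand(5, 8))  # stand-in for the code vectors
np.testing.assert_allclose(np.diag(linear_kernel(A, B)), (A * B).sum(axis=1))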
In [6]:
coff_maker = projects[0]
methods = coff_maker.code_methods.all()
methods = methods[0:2]
docs = list()
for method in methods:
    lex_info = method.lexical_info
    docs.append(lex_info.normalized_comment)
    docs.append(lex_info.normalized_code)
print('Methods: ', len(methods))
print('Docs: ', len(docs))
In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(input='content', sublinear_tf=True, lowercase=False)
X = vectorizer.fit_transform(docs)
In [14]:
vectorizer.get_feature_names()
Out[14]:
In [21]:
x = X[0].toarray()
from scipy.sparse import issparse
print(issparse(x))
In [30]:
x = x.ravel()
In [31]:
np.where(x>0)
Out[31]:
In [33]:
np.take(x, np.where(x>0))
Out[33]:
In [34]:
x[np.where(x>0)]
Out[34]:
In [35]:
print(vectorizer.get_feature_names())
In [36]:
docs[0]
Out[36]:
In [40]:
jhotdraw = projects[-1]
methods = jhotdraw.code_methods.all()
methods = methods[0:2]
docs = list()
for method in methods:
    lex_info = method.lexical_info
    docs.append(lex_info.normalized_comment)
    docs.append(lex_info.normalized_code)
print('Methods: ', len(methods))
print('Docs: ', len(docs))
In [42]:
docs[0], docs[1]
Out[42]:
In [44]:
methods[0].lexical_info.normalized_comment
Out[44]:
In [45]:
methods[0].lexical_info.normalized_code
Out[45]:
In [46]:
methods[0].example.target
Out[46]:
In [19]:
# Import Scikit-Learn
from sklearn.feature_extraction.text import TfidfVectorizer
## TODO: see the "Optimization" subsection below for the corresponding tests
from sklearn.metrics.pairwise import linear_kernel  # arrays are still L2 normalized

for project in projects:
    # Get Methods
    code_methods = project.code_methods.all()
    # Populate the Doc Collection
    document_collection = list()
    for method in code_methods:
        # Check that this method has no "wrong_association"
        n_evaluations = method.agreement_evaluations.count()
        n_eval_wrong_association = method.agreement_evaluations.filter(wrong_association=True).count()
        if n_evaluations == n_eval_wrong_association:
            # if **all** the evaluations for the current method mark it as a wrong_association,
            # exclude it from the statistics
            continue
        clexicon_info = method.lexical_info
        document_collection.append(clexicon_info.normalized_comment)
        document_collection.append(clexicon_info.normalized_code)
    vectorizer = TfidfVectorizer(input='content', sublinear_tf=False, lowercase=False, use_idf=False)
    tf_values = vectorizer.fit_transform(document_collection)
    #cosine_sim_vals = list()
    #rows, cols = tf_values.shape
    #for i in range(0, rows, 2):
    #    cosine_sim_vals.append(tf_values[i].dot(tf_values[i+1].T)[0,0])
    #cosine_sim_vals = np.array(cosine_sim_vals)
    comments, code = tf_values[::2], tf_values[1::2]
    kernel_matrix = linear_kernel(comments, code)
    cosine_sim_vals = np.diag(kernel_matrix)
    print('{proj} ({ver}) & {total} & {min:.3} & {max:.3} & {median:.3} & {mean:.3} & {variance:.3} & {devstd:.3} \\\\'.format(
        proj=project.name.title(), ver=project.version,
        total=cosine_sim_vals.size,
        min=cosine_sim_vals.min(),
        max=cosine_sim_vals.max(),
        median=median(cosine_sim_vals),
        mean=cosine_sim_vals.mean(),
        variance=var(cosine_sim_vals),
        devstd=std(cosine_sim_vals)))
Trying to optimize the cosine_similarity computation by replacing the cosine_sim_vals list (e.g., using np.vstack).
In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Target Project (as this is just an example)
project = projects[0]
# Get Methods
code_methods = project.code_methods.all()
# Populate the Doc Collection
document_collection = list()
for method in code_methods:
    clexicon_info = method.lexical_info
    document_collection.append(clexicon_info.normalized_comment)
    document_collection.append(clexicon_info.normalized_code)
vectorizer = TfidfVectorizer(input='content', sublinear_tf=True, lowercase=False)
tfidf_values = vectorizer.fit_transform(document_collection)
rows, cols = tfidf_values.shape
cosine_sim_vals = tfidf_values[0].dot(tfidf_values[1].T)[0,0]
for i in range(2, rows, 2):
    cosine_sim_vals = np.vstack((cosine_sim_vals, tfidf_values[i].dot(tfidf_values[i+1].T)[0,0]))
cosine_sim_vals.ravel()
Out[6]:
In [7]:
alt_method = np.einsum('ij,ij->i', tfidf_values[::2,].toarray(), tfidf_values[1::2,].toarray())
alt_method
Out[7]:
In [8]:
alt_method.shape
Out[8]:
In [9]:
cosine_sim_vals.ravel().shape
Out[9]:
In [10]:
np.testing.assert_allclose(cosine_sim_vals.ravel(), alt_method)
In [11]:
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
In [12]:
comments, code = tfidf_values[::2], tfidf_values[1::2]
print(comments.shape, code.shape)
In [13]:
kernel = linear_kernel(comments, code)
np.diag(kernel)
Out[13]:
In [14]:
from numpy.testing import assert_array_almost_equal
assert_array_almost_equal(alt_method, np.diag(kernel))
In [15]:
alt_method
Out[15]:
In [16]:
cossim = cosine_similarity(comments, code)
np.diag(cossim)
Out[16]:
In [17]:
assert_array_almost_equal(alt_method, np.diag(cossim))
assert_array_almost_equal(np.diag(cossim), np.diag(kernel))
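Note that both linear_kernel and cosine_similarity materialize the full pairwise matrix even though only its diagonal is needed. A sketch of a more memory-friendly alternative (assuming comments and code are the sparse tf-idf slices defined above): an element-wise multiplication followed by a row sum yields the same row-wise dot products directly.
In [ ]:
# Sketch: row-wise dot products on the sparse matrices,
# without building the full kernel matrix.
pairwise = np.asarray(comments.multiply(code).sum(axis=1)).ravel()
assert_array_almost_equal(pairwise, np.diag(kernel))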
In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from evaluations import Judge
judges_combinations = (('leonardo.nole', 'rossella.linsalata'),
                       ('leonardo.nole', 'antonio.petrone'),
                       ('leonardo.nole', 'antonio.petrone'),
                       ('leonardo.nole', 'rossella.linsalata'),)
CODES_Labels = ('NC', 'DK', 'CO')
from collections import defaultdict
stats_results = defaultdict(list)
for pno, project in enumerate(projects):
    # Get Methods
    code_methods = project.code_methods.all()
    # Populate the Doc Collection
    document_collection = list()
    method_ids_map = dict()  # Map (dict) to store the association method.pk --> Row index in Tfidf Matrix
    for mno, method in enumerate(code_methods):
        clexicon_info = method.lexical_info
        document_collection.append(clexicon_info.normalized_comment)
        document_collection.append(clexicon_info.normalized_code)
        method_ids_map[method.id] = mno*2
    vectorizer = TfidfVectorizer(input='content', sublinear_tf=True, lowercase=False)
    tfidf_values = vectorizer.fit_transform(document_collection)
    j1_usrname, j2_usrname = judges_combinations[pno]
    j1 = Judge(j1_usrname, project.name, project.version)
    j2 = Judge(j2_usrname, project.name, project.version)
    j1_evals = j1.three_codes_evaluations
    j2_evals = j2.three_codes_evaluations
    project_stats = list()
    for code in range(3):
        j1_evals_code = j1_evals[code]
        j2_evals_code = j2_evals[code]
        method_ids = j1_evals_code.intersection(j2_evals_code)
        cosine_sim_vals = list()
        for mid in method_ids:
            i = method_ids_map[mid]
            cosine_sim_vals.append(tfidf_values[i].dot(tfidf_values[i+1].T)[0,0])
        cosine_sim_vals = np.array(cosine_sim_vals)
        project_stats.append(cosine_sim_vals)
    for code in range(3):
        vals = project_stats[code]
        label = CODES_Labels[code]
        if vals.size > 0:
            stats_results[label].append('{proj} ({ver}) & {total} & {min:.3} & {max:.3} & {median:.3} & {mean:.3} & {variance:.3} & {devstd:.3} \\\\'.format(
                proj=project.name.title(),
                ver=project.version,
                total=vals.size,
                min=vals.min(),
                max=vals.max(),
                median=median(vals),
                mean=vals.mean(),
                variance=var(vals),
                devstd=std(vals)))
        else:
            stats_results[label].append('{proj} ({ver}) & \multicolumn{{7}}{{c|}}{{N.A.}} \\\\'.format(
                proj=project.name.title(), ver=project.version))

for label in stats_results:
    print('\n{0}\n'.format(label))
    for value in stats_results[label]:
        print(value)
In [13]:
judges_combinations = (('leonardo.nole', 'rossella.linsalata'),
                       ('leonardo.nole', 'antonio.petrone'),
                       ('leonardo.nole', 'antonio.petrone'),
                       ('leonardo.nole', 'rossella.linsalata'),)
CODES_Labels = ('NC', 'DK', 'CO')
from collections import defaultdict
import os

stats_results_paths = defaultdict(list)
pwd_out = !pwd
current_dir = pwd_out[0]
folder_path = os.path.join(current_dir, 'distributions_per_rate_tfidf')
if not os.path.exists(folder_path):
    os.makedirs(folder_path)
for pno, project in enumerate(projects):
    # Get Methods
    code_methods = project.code_methods.all()
    # Populate the Doc Collection
    document_collection = list()
    method_ids_map = dict()  # Map (dict) to store the association method.pk --> Row index in Tfidf Matrix
    for mno, method in enumerate(code_methods):
        clexicon_info = method.lexical_info
        document_collection.append(clexicon_info.normalized_comment)
        document_collection.append(clexicon_info.normalized_code)
        method_ids_map[method.id] = mno*2
    vectorizer = TfidfVectorizer(input='content', sublinear_tf=True, lowercase=False)
    tfidf_values = vectorizer.fit_transform(document_collection)
    j1_usrname, j2_usrname = judges_combinations[pno]
    j1 = Judge(j1_usrname, project.name, project.version)
    j2 = Judge(j2_usrname, project.name, project.version)
    j1_evals = j1.three_codes_evaluations
    j2_evals = j2.three_codes_evaluations
    project_stats = list()
    for code in range(3):
        j1_evals_code = j1_evals[code]
        j2_evals_code = j2_evals[code]
        method_ids = j1_evals_code.intersection(j2_evals_code)
        cosine_sim_vals = list()
        for mid in method_ids:
            i = method_ids_map[mid]
            cosine_sim_vals.append(tfidf_values[i].dot(tfidf_values[i+1].T)[0,0])
        cosine_sim_vals = np.array(cosine_sim_vals)
        project_stats.append(cosine_sim_vals)
    for code in range(3):
        vals = project_stats[code]
        label = CODES_Labels[code]
        if vals.size > 0:
            filename = '{label}_{proj}_({ver})_{total}.txt'.format(label=label,
                                                                   proj=project.name.title(),
                                                                   ver=project.version,
                                                                   total=vals.size)
            filepath = os.path.join(folder_path, filename)
            np.savetxt(filepath, vals)
            stats_results_paths[label].append(filepath)
for label in stats_results_paths:
    print('\n{0}\n'.format(label))
    for path in stats_results_paths[label]:
        print('Saved Filepath:', path)
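The saved distributions can later be reloaded with np.loadtxt, e.g. for further inspection or plotting; a minimal sketch (assuming the files written by the cell above):
In [ ]:
# Minimal sketch: reload the per-rate distributions saved above.
for label in stats_results_paths:
    for path in stats_results_paths[label]:
        vals = np.loadtxt(path)
        print(label, os.path.basename(path), vals.size)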